package org.yinxue.spider.core.parser;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.yinxue.spider.core.model.ATag;
import org.yinxue.spider.core.model.ImgTag;
import org.yinxue.spider.core.util.SetArrayList;
import org.yinxue.spider.core.util.StringUtils;
import org.yinxue.spider.core.util.UrlUtils;

import java.util.*;

/**
 * Extracts link-bearing tags ({@code <a href>} and {@code <img src>}) from an HTML document. <br>
 *
 * <p>For every matched element the attribute value is trimmed, filtered, has its spaces
 * percent-encoded, is turned into an absolute URL through {@link UrlUtils} and is de-duplicated.
 * The resulting list is sorted with {@link Collections#sort(List)}, i.e. by the natural ordering
 * of the model class (by URL according to the original author's notes; the ordering itself is
 * defined in {@code ImgTag}/{@code ATag}, not here).
 *
 * @author zengjian
 * @create 2018-07-10 16:25
 * @since 1.0.0
 */
public class Parser {

    /**
     * Percent-encoding of the space character. Note that {@code "%2B"} is the encoding of a
     * literal {@code '+'} and would therefore point at a different resource.
     */
    private static final String ENCODED_SPACE = "%20";

    /**
     * Converts the elements matched by a CSS query into model objects.
     *
     * @param <T> type of the model object produced for each accepted element
     */
    interface ElementHandler<T> {
        /**
         * @param elements elements matched by the CSS query
         * @param baseUrl  URL of the page the elements were taken from
         * @return the model objects built from {@code elements}
         */
        List<T> handleElement(Elements elements, String baseUrl);
    }

    /**
     * Collects all {@code <img>} elements that carry a {@code src} attribute.
     *
     * @param html    HTML source of the page
     * @param baseUrl URL of the page; used to resolve the image addresses and stored on each tag
     *                as its source URL
     * @return de-duplicated, sorted image tags; an empty list when {@code html} is empty
     */
    public List<ImgTag> parseImgTag(String html, final String baseUrl) {
        return doParse(html, baseUrl, "img[src]", new ElementHandler<ImgTag>() {
            @Override
            public List<ImgTag> handleElement(Elements elements, String baseUrl) {
                Set<String> seenUrls = new HashSet<>();
                List<ImgTag> list = new ArrayList<>();
                for (Element element : elements) {
                    String src = element.attr("src");
                    // Trim before filtering so that leading whitespace cannot hide a
                    // "javascript"/"#"/"mailto" prefix or a blank value from the filter.
                    String link = src.trim();
                    if (isUnfetchable(link)) {
                        continue;
                    }

                    // Resolve against the page URL with the project's own resolver
                    // (deliberately not jsoup's built-in absolute-URL support).
                    String url = UrlUtils.parseAbsoluteUrl(escapeSpaces(link), baseUrl);

                    // Set.add returns false for an already-seen URL, which de-duplicates.
                    if (seenUrls.add(url)) {
                        ImgTag imgTag = new ImgTag();
                        imgTag.setUrl(url);
                        // Keep the raw attribute value exactly as it appeared in the page.
                        imgTag.setSrc(src);
                        imgTag.setSrcUrl(baseUrl);
                        list.add(imgTag);
                    }
                }
                Collections.sort(list);
                return list;
            }
        });
    }

    /**
     * Collects all {@code <a>} elements that carry an {@code href} attribute.
     *
     * <p>{@code baseUrl} must already be resolved by the caller.
     *
     * @param html    HTML source of the page
     * @param baseUrl URL of the page; used to resolve relative links and stored on each tag as
     *                its source URL
     * @return de-duplicated, sorted anchor tags; an empty list when {@code html} is empty
     */
    public List<ATag> parseATag(String html, String baseUrl) {
        return doParse(html, baseUrl, "a[href]", new ElementHandler<ATag>() {
            @Override
            public List<ATag> handleElement(Elements elements, String baseUrl) {
                List<ATag> list = new SetArrayList<>();
                Set<String> seenUrls = new HashSet<>();
                for (Element element : elements) {
                    String href = element.attr("href");
                    // Trim before filtering so that leading whitespace cannot hide a
                    // "javascript"/"#"/"mailto" prefix or a blank value from the filter.
                    String link = href.trim();
                    if (isUnfetchable(link)) {
                        continue;
                    }

                    String parsedUrl = escapeSpaces(link);
                    // Only relative links are merged with the page URL; the project's own
                    // resolver is used instead of jsoup's built-in absolute-URL support.
                    if (UrlUtils.isRelationUrl(parsedUrl)) {
                        parsedUrl = UrlUtils.parseAbsoluteUrl(parsedUrl, baseUrl);
                    }

                    // Set.add returns false for an already-seen URL, which de-duplicates.
                    if (seenUrls.add(parsedUrl)) {
                        ATag atag = new ATag();
                        atag.setOuterHtml(element.outerHtml());
                        // Keep the raw attribute value exactly as it appeared in the page.
                        atag.setHref(href);
                        atag.setUrl(parsedUrl);
                        atag.setText(element.text());
                        atag.setSrcUrl(baseUrl);
                        list.add(atag);
                    }
                }
                Collections.sort(list);
                return list;
            }
        });
    }

    /**
     * Parses {@code html}, selects the elements matching {@code cssQuery} and hands them to
     * {@code handler}.
     *
     * @param html     HTML source; when empty the handler is not invoked
     * @param seedUrl  URL passed through to the handler as its {@code baseUrl}
     * @param cssQuery jsoup selector choosing the elements of interest
     * @param handler  converts the selected elements into model objects
     * @param <T>      model type produced by the handler
     * @return the handler's result, or {@link Collections#emptyList()} (immutable) when
     *         {@code html} is empty
     */
    protected <T> List<T> doParse(String html, String seedUrl, String cssQuery, ElementHandler<T> handler) {
        if (StringUtils.isEmpty(html)) {
            return Collections.emptyList();
        }
        Elements elements = Jsoup.parse(html).select(cssQuery);
        return handler.handleElement(elements, seedUrl);
    }

    /** Tells whether an already-trimmed link is empty or uses a prefix that cannot be crawled. */
    private static boolean isUnfetchable(String link) {
        return StringUtils.isEmpty(link) || StringUtils.startsWith(link, "javascript", "#", "mailto");
    }

    /** Percent-encodes the spaces left inside a trimmed link so it can be used as a request URL. */
    private static String escapeSpaces(String link) {
        return link.replace(" ", ENCODED_SPACE);
    }
}
